//MatrixMulAvx.cs

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static Util;
using size_t = nuint;
using m256d = System.Runtime.Intrinsics.Vector256<double>;
using m128d = System.Runtime.Intrinsics.Vector128<double>;

/// <summary>
/// Double-precision GEMM micro-kernels built on 256-bit AVX intrinsics.
/// The main kernel accumulates a 6-row-by-8-column tile of C in twelve ymm
/// accumulators; the packing routines rearrange A and B so the kernels can
/// stream through them with unit stride. Partial (edge) tiles are handled by
/// masked kernels (MulKernelMx8 / MulKernelMx4).
/// </summary>
unsafe struct MatrixMulAvx : IMatrixMulSimdSpecific
{
	// Blocking parameters consumed by the (unseen) driver.
	// KernelRows x KernelCols is the 6x8 micro-tile of C computed per kernel
	// call; BlockRows/BlockCols are the cache-block dimensions — presumably
	// applied to the packed panels by the caller (TODO confirm which matrix
	// each applies to; the driver is not visible in this file).
	public size_t BlockRows => 228;
	public size_t BlockCols => 128;
	public size_t KernelRows => 6;
	public size_t KernelCols => 8;

	// Pack a src_rows x src_cols block (row-major, leading dimension
	// src_stride) into column-slabs of 8 for the kernel's B operand.
	public void PackBlock(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		PackBlockBy8Cols(dst, src, src_rows, src_cols, src_stride);
	}

	// Pack a vertical panel into row-groups of 6 (column-interleaved) for the
	// kernel's A operand.
	public void PackVPanel(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		PackVPanelBy6Rows(dst, src, src_rows, src_cols, src_stride);
	}

	// Full 6x8 tile: C (+)= A*B over A_cols rank-1 updates.
	// 'add' selects accumulate-into-C vs. overwrite-C.
	public void MulKernelComplete(bool add, double* C, double* A, double* B, size_t A_cols, size_t C_stride)
	{
		MulKernel6x8(add, C, A, B, A_cols, C_stride);
	}

	// Edge tile (C_rows <= 6, C_cols <= 8): dispatch on width so the narrow
	// case uses a single masked 4-lane column instead of 8 lanes.
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public void MulKernelIncomplete(bool add, double* C, double* A, double* B, size_t C_rows, size_t C_cols, size_t A_cols, size_t C_stride)
	{
		if (C_cols > 4)
			MulKernelMx8(add, C, A, B, C_rows, C_cols, A_cols, C_stride);
		else
			MulKernelMx4(add, C, A, B, C_rows, C_cols, A_cols, C_stride);
	}

	/// <summary>
	/// Packs src (row-major, leading dimension src_stride) into groups of 6
	/// rows stored column-interleaved: within each group, the 6 elements of
	/// column j are contiguous in dst, followed by column j+1, etc. This is
	/// the layout MulKernel6x8 consumes via "a0 += 6" per k-step.
	/// </summary>
	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	public static void PackVPanelBy6Rows(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		size_t i = 0;
		// Main case: full groups of 6 rows, transposed 4 columns at a time.
		for (; i + 6 <= src_rows; i += 6)
		{
			double* src0 = src + (i + 0) * src_stride;
			double* src1 = src + (i + 1) * src_stride;
			double* src2 = src + (i + 2) * src_stride;
			double* src3 = src + (i + 3) * src_stride;
			double* src4 = src + (i + 4) * src_stride;
			double* src5 = src + (i + 5) * src_stride;

			size_t j = 0;
			for (; j + 4 <= src_cols; j += 4)
			{
				// Load a 6x4 sub-tile (one 4-wide vector per row) ...
				m256d v0 = Avx.LoadVector256(src0 + j);
				m256d v1 = Avx.LoadVector256(src1 + j);
				m256d v2 = Avx.LoadVector256(src2 + j);
				m256d v3 = Avx.LoadVector256(src3 + j);
				m256d v4 = Avx.LoadVector256(src4 + j);
				m256d v5 = Avx.LoadVector256(src5 + j);

				// ... transpose in registers: v0..v3 become columns 0..3 of
				// rows 0-3, h0..h3 the matching 2-element pairs of rows 4-5.
				m128d h0, h1, h2, h3;
				Transpose6x4(ref v0, ref v1, ref v2, ref v3, v4, v5,
							 out h0, out h1, out h2, out h3);

				// Store each column as 4 + 2 doubles = one 6-element group.
				Avx.Store(dst + 6 * 0, v0); Avx.Store(dst + 6 * 0 + 4, h0);
				Avx.Store(dst + 6 * 1, v1); Avx.Store(dst + 6 * 1 + 4, h1);
				Avx.Store(dst + 6 * 2, v2); Avx.Store(dst + 6 * 2 + 4, h2);
				Avx.Store(dst + 6 * 3, v3); Avx.Store(dst + 6 * 3 + 4, h3);
				dst += 6 * 4;
			}

			// Scalar tail: remaining columns one at a time.
			for (; j < src_cols; ++j)
			{
				*(dst + 0) = *(src0 + j);
				*(dst + 1) = *(src1 + j);
				*(dst + 2) = *(src2 + j);
				*(dst + 3) = *(src3 + j);
				*(dst + 4) = *(src4 + j);
				*(dst + 5) = *(src5 + j);
				dst += 6;
			}
		}
		// Remainder group of 1..5 rows: packed with stride src_rows_rem, no
		// zero padding (the masked kernels only read C_rows entries per k).
		if (i < src_rows)
		{
			size_t src_rows_rem = src_rows - i;
			// Pointers past src_rows_rem are computed but never dereferenced
			// (the gotos below bail out first); computing them is harmless.
			double* src0 = src + (i + 0) * src_stride;
			double* src1 = src + (i + 1) * src_stride;
			double* src2 = src + (i + 2) * src_stride;
			double* src3 = src + (i + 3) * src_stride;
			double* src4 = src + (i + 4) * src_stride;
			for (size_t j = 0; j < src_cols; ++j)
			{
				// Fall-through ladder: copy exactly src_rows_rem elements.
				*(dst + 0) = *(src0 + j); if (src_rows_rem == 1) goto L_next_col;
				*(dst + 1) = *(src1 + j); if (src_rows_rem == 2) goto L_next_col;
				*(dst + 2) = *(src2 + j); if (src_rows_rem == 3) goto L_next_col;
				*(dst + 3) = *(src3 + j); if (src_rows_rem == 4) goto L_next_col;
				*(dst + 4) = *(src4 + j);
			L_next_col:
				dst += src_rows_rem;
			}
		}
	}

	// In-register transpose of a 6x4 tile. On input m0..m5 hold rows 0-5
	// (4 columns each); on output m0..m3 hold columns 0-3 of rows a-d, and
	// h0..h3 hold the (e,f) pair of each column. The port/latency comments
	// (p5 l1 etc.) annotate the expected execution port and latency of each
	// x86 instruction.
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	static void Transpose6x4(ref m256d m0, ref m256d m1, ref m256d m2, ref m256d m3, m256d m4, m256d m5,
							 out m128d h0, out m128d h1, out m128d h2, out m128d h3)
	{
		// Rename inputs so the lane-shuffle bookkeeping below is readable:
		// letter = source row, digit = column index.
		m256d a0a1a2a3 = m0;
		m256d b0b1b2b3 = m1;
		m256d c0c1c2c3 = m2;
		m256d d0d1d2d3 = m3;
		m256d e0e1e2e3 = m4;
		m256d f0f1f2f3 = m5;

		// Interleave row pairs within 128-bit lanes.
		m256d a0b0a2b2 = Avx.UnpackLow(a0a1a2a3, b0b1b2b3);  // p5 l1
		m256d a1b1a3b3 = Avx.UnpackHigh(a0a1a2a3, b0b1b2b3);  // p5 l1
		m256d c0d0c2d2 = Avx.UnpackLow(c0c1c2c3, d0d1d2d3);  // p5 l1
		m256d c1d1c3d3 = Avx.UnpackHigh(c0c1c2c3, d0d1d2d3);  // p5 l1

		// Cross-lane swap (0x21 = high lane of arg0, low lane of arg1), then
		// blends select the final column vectors without further shuffles.
		m256d a2b2c0d0 = Avx.Permute2x128(a0b0a2b2, c0d0c2d2, 0x21);  // p5 l3
		m256d a3b3c1d1 = Avx.Permute2x128(a1b1a3b3, c1d1c3d3, 0x21);  // p5 l3

		m256d a0b0c0d0 = Avx.Blend(a0b0a2b2, a2b2c0d0, 0b1100);  // p015 l1
		m256d a1b1c1d1 = Avx.Blend(a1b1a3b3, a3b3c1d1, 0b1100);  // p015 l1
		m256d a2b2c2d2 = Avx.Blend(a2b2c0d0, c0d0c2d2, 0b1100);  // p015 l1
		m256d a3b3c3d3 = Avx.Blend(a3b3c1d1, c1d1c3d3, 0b1100);  // p015 l1

		// Rows e,f only need interleaving; each column yields a 128-bit pair.
		m256d e0f0e2f2 = Avx.UnpackLow(e0e1e2e3, f0f1f2f3);  // p5 l1
		m256d e1f1e3f3 = Avx.UnpackHigh(e0e1e2e3, f0f1f2f3);  // p5 l1

		m128d e0f0 = e0f0e2f2.GetLower();
		m128d e1f1 = e1f1e3f3.GetLower();
		m128d e2f2 = e0f0e2f2.GetUpper();  // p5 l3
		m128d e3f3 = e1f1e3f3.GetUpper();  // p5 l3

		m0 = a0b0c0d0;
		m1 = a1b1c1d1;
		m2 = a2b2c2d2;
		m3 = a3b3c3d3;
		h0 = e0f0;
		h1 = e1f1;
		h2 = e2f2;
		h3 = e3f3;
	}

	/// <summary>
	/// Packs src (row-major) into vertical slabs of 8 columns: within a slab,
	/// each row's 8 doubles are contiguous and consecutive rows follow each
	/// other (row stride 8). A partial last slab is packed separately with a
	/// width of 4 or 8 (matching the Mx4/Mx8 kernel dispatch on C_cols > 4)
	/// and zero-padded to that width.
	/// </summary>
	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	public static void PackBlockBy8Cols(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		size_t packed_rows = (src_cols + 7) / 8;
		// Start of the last (possibly partial) slab in dst.
		double* dst_last_row = dst + (packed_rows - 1) * src_rows * 8;
		// Partial slab width: 4 if the remainder fits in one vector, else 8.
		// (When src_cols % 8 == 0 the tail branch below never runs, so the
		// value chosen here is irrelevant.)
		size_t last_row_step = src_cols % 8 <= 4 ? 4u : 8u;

		for (size_t i = 0; i < src_rows; ++i)
		{
			double* src_row = src + i * src_stride;
			double* dst_col = dst + i * 8;

			size_t j = 0;
			// Full 8-column slabs: two vector copies per row.
			for (; j + 8 <= src_cols; j += 8)
			{
				m256d v0 = Avx.LoadVector256(src_row + j + 0);
				m256d v1 = Avx.LoadVector256(src_row + j + 4);
				Avx.Store(dst_col + 0, v0);
				Avx.Store(dst_col + 4, v1);
				dst_col += src_rows * 8;  // next slab, same row
			}

			// Partial slab: copy the remaining columns, zero-fill the rest.
			if (j < src_cols)
			{
				for (size_t j2 = 0; j2 < last_row_step; ++j2, ++j)
					dst_last_row[j2] = j < src_cols ? src_row[j] : 0;

				dst_last_row += last_row_step;
			}
		}
	}


	// Full 6x8 micro-kernel: C[0..6, 0..8] (+)= A*B as A_cols rank-1 updates.
	// A is packed 6 doubles per k-step (PackVPanelBy6Rows layout), B is
	// packed 8 doubles per k-step (PackBlockBy8Cols layout). The 12
	// accumulators map exactly onto 12 of the 16 ymm registers.
	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	void MulKernel6x8(bool add, double* C, double* A, double* B, size_t A_cols, size_t C_stride)
	{
		m256d sum00, sum01, sum10, sum11,
			  sum20, sum21, sum30, sum31,
			  sum40, sum41, sum50, sum51;

		sum00 = sum01 = sum10 = sum11 = sum20 = sum21
			  = sum30 = sum31 = sum40 = sum41 = sum50 = sum51
			  = m256d.Zero;

		double* a0 = A, b0 = B;
		for (size_t k = 0; k < A_cols; ++k)
		{
			// Prefetch a few iterations ahead in both packed streams
			// (384 bytes = 8 k-steps of A, 6 k-steps of B).
			const size_t prefetch_offset = 64 * 6;
			Sse.Prefetch0((byte*)a0 + prefetch_offset);
			Sse.Prefetch0((byte*)b0 + prefetch_offset);

			// One rank-1 update: broadcast each of the 6 A values and
			// multiply-add against the two 4-wide B vectors.
			m256d va, vb0, vb1;
			vb0 = Avx.LoadVector256(b0);
			vb1 = Avx.LoadVector256(b0 + 4);

			va = Avx.BroadcastScalarToVector256(a0 + 0);
			sum00 = Avx.Add(sum00, Avx.Multiply(vb0, va));
			sum01 = Avx.Add(sum01, Avx.Multiply(vb1, va));

			va = Avx.BroadcastScalarToVector256(a0 + 1);
			sum10 = Avx.Add(sum10, Avx.Multiply(vb0, va));
			sum11 = Avx.Add(sum11, Avx.Multiply(vb1, va));

			va = Avx.BroadcastScalarToVector256(a0 + 2);
			sum20 = Avx.Add(sum20, Avx.Multiply(vb0, va));
			sum21 = Avx.Add(sum21, Avx.Multiply(vb1, va));

			va = Avx.BroadcastScalarToVector256(a0 + 3);
			sum30 = Avx.Add(sum30, Avx.Multiply(vb0, va));
			sum31 = Avx.Add(sum31, Avx.Multiply(vb1, va));

			va = Avx.BroadcastScalarToVector256(a0 + 4);
			sum40 = Avx.Add(sum40, Avx.Multiply(vb0, va));
			sum41 = Avx.Add(sum41, Avx.Multiply(vb1, va));

			va = Avx.BroadcastScalarToVector256(a0 + 5);
			sum50 = Avx.Add(sum50, Avx.Multiply(vb0, va));
			sum51 = Avx.Add(sum51, Avx.Multiply(vb1, va));

			a0 += 6;
			b0 += 8;
		}

		double* c0, c1, c2, c3, c4, c5;
		c0 = C + C_stride * 0;
		c1 = C + C_stride * 1;
		c2 = C + C_stride * 2;
		c3 = C + C_stride * 3;
		c4 = C + C_stride * 4;
		c5 = C + C_stride * 5;

		// 'add' == true: accumulate into existing C values before storing.
		if (add)
		{
			sum00 = Avx.Add(sum00, Avx.LoadVector256(c0 + 0));
			sum01 = Avx.Add(sum01, Avx.LoadVector256(c0 + 4));
			sum10 = Avx.Add(sum10, Avx.LoadVector256(c1 + 0));
			sum11 = Avx.Add(sum11, Avx.LoadVector256(c1 + 4));
			sum20 = Avx.Add(sum20, Avx.LoadVector256(c2 + 0));
			sum21 = Avx.Add(sum21, Avx.LoadVector256(c2 + 4));
			sum30 = Avx.Add(sum30, Avx.LoadVector256(c3 + 0));
			sum31 = Avx.Add(sum31, Avx.LoadVector256(c3 + 4));
			sum40 = Avx.Add(sum40, Avx.LoadVector256(c4 + 0));
			sum41 = Avx.Add(sum41, Avx.LoadVector256(c4 + 4));
			sum50 = Avx.Add(sum50, Avx.LoadVector256(c5 + 0));
			sum51 = Avx.Add(sum51, Avx.LoadVector256(c5 + 4));
		}
		Avx.Store(c0 + 0, sum00);
		Avx.Store(c0 + 4, sum01);
		Avx.Store(c1 + 0, sum10);
		Avx.Store(c1 + 4, sum11);
		Avx.Store(c2 + 0, sum20);
		Avx.Store(c2 + 4, sum21);
		Avx.Store(c3 + 0, sum30);
		Avx.Store(c3 + 4, sum31);
		Avx.Store(c4 + 0, sum40);
		Avx.Store(c4 + 4, sum41);
		Avx.Store(c5 + 0, sum50);
		Avx.Store(c5 + 4, sum51);
	}


	// Edge kernel for C_rows in 1..6 and C_cols in 5..8. A is packed with
	// C_rows doubles per k-step (partial-panel layout of PackVPanelBy6Rows);
	// B still has 8 doubles per k-step. Columns 5..7 are handled with a
	// masked load/store on the upper 4 lanes.
	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	void MulKernelMx8(bool add, double* C, double* A, double* B, size_t C_rows, size_t C_cols, size_t A_cols, size_t C_stride)
	{
		m256d sum00, sum01, sum10, sum11,
			  sum20, sum21, sum30, sum31,
			  sum40, sum41, sum50, sum51;

		sum00 = sum01 = sum10 = sum11 = sum20 = sum21
			= sum30 = sum31 = sum40 = sum41 = sum50 = sum51
			= m256d.Zero;

		// Hand-rolled loop with gotos: each k-step performs only the first
		// C_rows row-updates, bailing out of the straight-line ladder early.
		size_t k = 0;
		{
			goto loop_check;
		loop_update:
			k += 1;
		loop_check:
			if (k >= A_cols) goto loop_end;

			double* a0 = A, b0 = B;
			A += C_rows;  // partial A panel: C_rows values per k-step
			B += 8;
			m256d va, vb0, vb1;
			vb0 = Avx.LoadVector256(b0);
			vb1 = Avx.LoadVector256(b0 + 4);

			va = Avx.BroadcastScalarToVector256(a0 + 0);
			sum00 = Avx.Add(sum00, Avx.Multiply(vb0, va));
			sum01 = Avx.Add(sum01, Avx.Multiply(vb1, va));
			if (C_rows == 1) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 1);
			sum10 = Avx.Add(sum10, Avx.Multiply(vb0, va));
			sum11 = Avx.Add(sum11, Avx.Multiply(vb1, va));
			if (C_rows == 2) goto loop_update;


			va = Avx.BroadcastScalarToVector256(a0 + 2);
			sum20 = Avx.Add(sum20, Avx.Multiply(vb0, va));
			sum21 = Avx.Add(sum21, Avx.Multiply(vb1, va));
			if (C_rows == 3) goto loop_update;


			va = Avx.BroadcastScalarToVector256(a0 + 3);
			sum30 = Avx.Add(sum30, Avx.Multiply(vb0, va));
			sum31 = Avx.Add(sum31, Avx.Multiply(vb1, va));
			if (C_rows == 4) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 4);
			sum40 = Avx.Add(sum40, Avx.Multiply(vb0, va));
			sum41 = Avx.Add(sum41, Avx.Multiply(vb1, va));
			if (C_rows == 5) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 5);
			sum50 = Avx.Add(sum50, Avx.Multiply(vb0, va));
			sum51 = Avx.Add(sum51, Avx.Multiply(vb1, va));
			goto loop_update;
		loop_end:;
		}

		double* c0, c1, c2, c3, c4, c5;
		c0 = C + C_stride * 0;
		c1 = C + C_stride * 1;
		c2 = C + C_stride * 2;
		c3 = C + C_stride * 3;
		c4 = C + C_stride * 4;
		c5 = C + C_stride * 5;
		if (C_cols == 8)
		{
			// 'while (add)' is an 'if (add)' whose body can break out after
			// the first C_rows rows — it never iterates more than once.
			while (add)
			{
				sum00 = Avx.Add(sum00, Avx.LoadVector256(c0 + 0));
				sum01 = Avx.Add(sum01, Avx.LoadVector256(c0 + 4)); if (C_rows == 1) break;
				sum10 = Avx.Add(sum10, Avx.LoadVector256(c1 + 0));
				sum11 = Avx.Add(sum11, Avx.LoadVector256(c1 + 4)); if (C_rows == 2) break;
				sum20 = Avx.Add(sum20, Avx.LoadVector256(c2 + 0));
				sum21 = Avx.Add(sum21, Avx.LoadVector256(c2 + 4)); if (C_rows == 3) break;
				sum30 = Avx.Add(sum30, Avx.LoadVector256(c3 + 0));
				sum31 = Avx.Add(sum31, Avx.LoadVector256(c3 + 4)); if (C_rows == 4) break;
				sum40 = Avx.Add(sum40, Avx.LoadVector256(c4 + 0));
				sum41 = Avx.Add(sum41, Avx.LoadVector256(c4 + 4)); if (C_rows == 5) break;
				sum50 = Avx.Add(sum50, Avx.LoadVector256(c5 + 0));
				sum51 = Avx.Add(sum51, Avx.LoadVector256(c5 + 4)); break;
			}

			// Store the first C_rows rows (same single-pass break ladder).
			while (true)
			{
				Avx.Store(c0 + 0, sum00);
				Avx.Store(c0 + 4, sum01); if (C_rows == 1) break;
				Avx.Store(c1 + 0, sum10);
				Avx.Store(c1 + 4, sum11); if (C_rows == 2) break;
				Avx.Store(c2 + 0, sum20);
				Avx.Store(c2 + 4, sum21); if (C_rows == 3) break;
				Avx.Store(c3 + 0, sum30);
				Avx.Store(c3 + 4, sum31); if (C_rows == 4) break;
				Avx.Store(c4 + 0, sum40);
				Avx.Store(c4 + 4, sum41); if (C_rows == 5) break;
				Avx.Store(c5 + 0, sum50);
				Avx.Store(c5 + 4, sum51); break;
			}
		}
		else
		{
			// C_cols in 5..7: full vector on lanes 0-3, masked access on the
			// remaining C_cols - 4 lanes (C_cols % 4 == C_cols - 4 here).
			m256d mask = GetMask(C_cols % 4);
			while (add)
			{
				sum00 = Avx.Add(sum00, Avx.LoadVector256(c0)); sum01 = Avx.Add(sum01, Avx.MaskLoad(c0 + 4, mask)); if (C_rows == 1) break;
				sum10 = Avx.Add(sum10, Avx.LoadVector256(c1)); sum11 = Avx.Add(sum11, Avx.MaskLoad(c1 + 4, mask)); if (C_rows == 2) break;
				sum20 = Avx.Add(sum20, Avx.LoadVector256(c2)); sum21 = Avx.Add(sum21, Avx.MaskLoad(c2 + 4, mask)); if (C_rows == 3) break;
				sum30 = Avx.Add(sum30, Avx.LoadVector256(c3)); sum31 = Avx.Add(sum31, Avx.MaskLoad(c3 + 4, mask)); if (C_rows == 4) break;
				sum40 = Avx.Add(sum40, Avx.LoadVector256(c4)); sum41 = Avx.Add(sum41, Avx.MaskLoad(c4 + 4, mask)); if (C_rows == 5) break;
				sum50 = Avx.Add(sum50, Avx.LoadVector256(c5)); sum51 = Avx.Add(sum51, Avx.MaskLoad(c5 + 4, mask)); break;
			}

			while (true)
			{
				Avx.Store(c0, sum00); Avx.MaskStore(c0 + 4, mask, sum01); if (C_rows == 1) break;
				Avx.Store(c1, sum10); Avx.MaskStore(c1 + 4, mask, sum11); if (C_rows == 2) break;
				Avx.Store(c2, sum20); Avx.MaskStore(c2 + 4, mask, sum21); if (C_rows == 3) break;
				Avx.Store(c3, sum30); Avx.MaskStore(c3 + 4, mask, sum31); if (C_rows == 4) break;
				Avx.Store(c4, sum40); Avx.MaskStore(c4 + 4, mask, sum41); if (C_rows == 5) break;
				Avx.Store(c5, sum50); Avx.MaskStore(c5 + 4, mask, sum51); break;
			}
		}
	}

	// Edge kernel for C_rows in 1..6 and C_cols in 1..4: one accumulator per
	// row; columns 1..3 use masked C access. Same goto/break-ladder scheme as
	// MulKernelMx8.
	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	void MulKernelMx4(bool add, double* C, double* A, double* B, size_t C_rows, size_t C_cols, size_t A_cols, size_t C_stride)
	{
		m256d sum00, sum10, sum20, sum30, sum40, sum50;

		sum00 = sum10 = sum20 = sum30 = sum40 = sum50
			= m256d.Zero;

		size_t k = 0;
		{
			goto loop_check;
		loop_update:
			k += 1;
		loop_check:
			if (k >= A_cols) goto loop_end;

			double* a0 = A, b0 = B;
			A += C_rows;  // partial A panel: C_rows values per k-step
			B += 4;       // B packed 4-wide for this kernel
			m256d va, vb0;
			vb0 = Avx.LoadVector256(b0);

			va = Avx.BroadcastScalarToVector256(a0 + 0);
			sum00 = Avx.Add(sum00, Avx.Multiply(vb0, va));
			if (C_rows == 1) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 1);
			sum10 = Avx.Add(sum10, Avx.Multiply(vb0, va));
			if (C_rows == 2) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 2);
			sum20 = Avx.Add(sum20, Avx.Multiply(vb0, va));
			if (C_rows == 3) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 3);
			sum30 = Avx.Add(sum30, Avx.Multiply(vb0, va));
			if (C_rows == 4) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 4);
			sum40 = Avx.Add(sum40, Avx.Multiply(vb0, va));
			if (C_rows == 5) goto loop_update;

			va = Avx.BroadcastScalarToVector256(a0 + 5);
			sum50 = Avx.Add(sum50, Avx.Multiply(vb0, va));
			goto loop_update;
		loop_end:;
		}

		double* c0, c1, c2, c3, c4, c5;
		c0 = C + C_stride * 0;
		c1 = C + C_stride * 1;
		c2 = C + C_stride * 2;
		c3 = C + C_stride * 3;
		c4 = C + C_stride * 4;
		c5 = C + C_stride * 5;
		if (C_cols == 4)
		{
			// Single-pass 'if (add)' with early break after C_rows rows.
			while (add)
			{
				sum00 = Avx.Add(sum00, Avx.LoadVector256(c0)); if (C_rows == 1) break;
				sum10 = Avx.Add(sum10, Avx.LoadVector256(c1)); if (C_rows == 2) break;
				sum20 = Avx.Add(sum20, Avx.LoadVector256(c2)); if (C_rows == 3) break;
				sum30 = Avx.Add(sum30, Avx.LoadVector256(c3)); if (C_rows == 4) break;
				sum40 = Avx.Add(sum40, Avx.LoadVector256(c4)); if (C_rows == 5) break;
				sum50 = Avx.Add(sum50, Avx.LoadVector256(c5)); break;
			}

			while (true)
			{
				Avx.Store(c0, sum00); if (C_rows == 1) break;
				Avx.Store(c1, sum10); if (C_rows == 2) break;
				Avx.Store(c2, sum20); if (C_rows == 3) break;
				Avx.Store(c3, sum30); if (C_rows == 4) break;
				Avx.Store(c4, sum40); if (C_rows == 5) break;
				Avx.Store(c5, sum50); break;
			}
		}
		else
		{
			// C_cols in 1..3: mask selects the first C_cols lanes.
			m256d mask = GetMask(C_cols);
			while (add)
			{
				sum00 = Avx.Add(sum00, Avx.MaskLoad(c0, mask)); if (C_rows == 1) break;
				sum10 = Avx.Add(sum10, Avx.MaskLoad(c1, mask)); if (C_rows == 2) break;
				sum20 = Avx.Add(sum20, Avx.MaskLoad(c2, mask)); if (C_rows == 3) break;
				sum30 = Avx.Add(sum30, Avx.MaskLoad(c3, mask)); if (C_rows == 4) break;
				sum40 = Avx.Add(sum40, Avx.MaskLoad(c4, mask)); if (C_rows == 5) break;
				sum50 = Avx.Add(sum50, Avx.MaskLoad(c5, mask)); break;
			}

			while (true)
			{
				Avx.MaskStore(c0, mask, sum00); if (C_rows == 1) break;
				Avx.MaskStore(c1, mask, sum10); if (C_rows == 2) break;
				Avx.MaskStore(c2, mask, sum20); if (C_rows == 3) break;
				Avx.MaskStore(c3, mask, sum30); if (C_rows == 4) break;
				Avx.MaskStore(c4, mask, sum40); if (C_rows == 5) break;
				Avx.MaskStore(c5, mask, sum50); break;
			}
		}
	}

	// Lazily-allocated backing for GetMask.
	double* mask_arr;   // [ all1, all1, all1, all1, all0, all0, all0, all0 ]

	// Allocates and fills the mask window via Util.CacheAlignAlloc.
	// NoInlining keeps this cold path out of GetMask's inlined body.
	[MethodImpl(MethodImplOptions.NoInlining)]
	void InitMask()
	{
		mask_arr = CacheAlignAlloc(8);
		Avx.Store(mask_arr + 0, Vector256.Create((long)-1).AsDouble());
		Avx.Store(mask_arr + 4, m256d.Zero);
	}

	// Returns a MaskLoad/MaskStore mask with the first n (1..4) lanes
	// enabled, by sliding a 4-wide window over [1,1,1,1,0,0,0,0].
	// NOTE(review): lazy init mutates this struct; if GetMask runs on a copy
	// the allocation is lost and leaks — confirm callers hold one instance.
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public m256d GetMask(size_t n)
	{
		if (mask_arr == null)
			InitMask();
		return Avx.LoadVector256(mask_arr + 4 - n);
	}

	// Releases the lazily-allocated mask buffer; safe to call repeatedly.
	public void Dispose()
	{
		if (mask_arr == null)
			return;
		CacheAlignFree(mask_arr);
		mask_arr = null;
	}
}
